/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_DataCollectionDetails.csv (downloaded 5/22/2018)

Outputs:	clean_collection.dta

*******************************************************************************/

* Start from an empty session and load the raw PDB data-collection export.
clear all
import delimited using "${data_raw}PDB/pdb_DataCollectionDetails.csv", clear

* Switch the all-lowercase variable names to the camelCase names used in the
* rest of this file, in one grouped rename (old names first, new names second).
rename (structureid diffractionsource collectiondate collectiontemperature) ///
	(structureId diffractionSource collectionDate collectionTemp)
	
* Two date strings use a "MON-YYYY" layout; rewrite them as "YYYY-MM"
replace collectionDate = "1998-03" if collectionDate == "MAR-1998"
replace collectionDate = "1998-05" if collectionDate == "MAY-1998"

* Record how complete each reported date is, measured by the string length
* BEFORE any padding is applied (this flag stays in the data set)
gen fix_collectionDate = strlen(collectionDate)
label define fix 0 "Date Missing" 4 "Year only" 7 "Month/Year only" 		///
	10 "Full date"
label values fix_collectionDate fix

* Pad partial dates to the first day (and first month) of the period.
* The conditions use the stored flag, so the two pads cannot interact.
replace collectionDate = collectionDate + "-01" if fix_collectionDate == 7
replace collectionDate = collectionDate + "-01-01" if fix_collectionDate == 4

* Every string must now be either empty or a 10-character date
assert inlist(strlen(collectionDate), 0, 10)

* Swap the string for a numeric daily date that keeps the same variable name
gen collectionDate_d = date(collectionDate, "YMD")
format collectionDate_d %td
drop collectionDate
rename collectionDate_d collectionDate
	
* Show how many structures appear on more than one row. The tag is only
* needed for this tabulation, so it is dropped straight away.
duplicates tag structureId, gen(dup)
tab dup
drop dup

* A small share of structures have multiple collection dates. Keep the
* earliest one. Numeric missing values sort last in Stata, so a row with a
* missing date is kept only when the structure has no dated row at all.
* The stable sort keeps the original file order among ties.
sort structureId collectionDate, stable
by structureId: keep if _n == 1

* structureId must now uniquely identify the observations
isid structureId
	

 * Clean up the diffraction source.
 * Only the first token of the raw string is kept: split parses on spaces by
 * default and limit(1) creates just diffractionSource1. The token is tabulated
 * as reported and then upper-cased before names are harmonized.
split diffractionSource, limit(1)
tab diffractionSource1, sort mis
replace diffractionSource1 = upper(diffractionSource1)

* Fix some names by hand. Each facility is listed once with all of its
* observed variants (trailing commas, alternate spellings, typos). No target
* name is itself a variant of another rule, so the order of these statements
* does not matter.
replace diffractionSource1 = "EMBL/DESY" if inlist(diffractionSource1, 		///
	"EMBL/DESY,", "DESY-EMBL,HAMBURG", "DESY", "DESY/EMBL,")
replace diffractionSource1 = "MPG/DESY" if diffractionSource1 == "MPG/DESY,"
* NSLS-II is pooled with NSLS
replace diffractionSource1 = "NSLS" if diffractionSource1 == "NSLS-II"
replace diffractionSource1 = "APS" if diffractionSource1 == "APS,"
replace diffractionSource1 = "ESRF" if diffractionSource1 == "ESRF,"
replace diffractionSource1 = "MAX" if diffractionSource1 == "MAXLAB"
replace diffractionSource1 = "PAL/PLS" if diffractionSource1 == "POHANG"
replace diffractionSource1 = "SPRING-8" if inlist(diffractionSource1, 		///
	"SPING-8", "SPRING8")
	
* Re-code the diffraction source as "Other/HomeLab" if it is small.
* size is the number of observations sharing a cleaned source name. Only
* sources with more than 100 observations keep their own name; everything
* else (including an empty source name) is left blank here and pooled below.
bysort diffractionSource1: gen size = _N
gen lightSource = diffractionSource1 if size > 100
replace lightSource = "Other/HomeLab" if lightSource == ""

* These are the names of small rotating anode machines (no synchrotron used).
* They are pooled into the same category regardless of how often they occur.
replace lightSource = "Other/HomeLab" if 									///
	inlist(diffractionSource1, "RIGAKU", "ENRAF-NONIUS", "OTHER", "BRUKER", 	///
	"SIEMENS", "MACSCIENCE", "ELLIOTT", "OXFORD")

* The helper variables are no longer needed once lightSource is built
drop size diffractionSource1
	
 * Sort light sources into continents to use as controls.
 * The three facility lists are mutually exclusive; any light source not
 * listed (including "Other/HomeLab") falls through to "Other/NotSync".
 * str13 is the width of the longest label.
gen str13 syncLocation = 													///
	cond(inlist(lightSource, "ESRF", "SLS", "DIAMOND", "SOLEIL", "BESSY", 	///
		"EMBL/DESY", "SRS", "MAX"), "Europe", 								///
	cond(inlist(lightSource, "APS", "NSLS", "ALS", "SSRL", "CLSI"), 		///
		"North America", 													///
	cond(inlist(lightSource, "SPRING-8", "PHOTON", "SSRF", "PAL/PLS", 		///
		"NSRRC"), "Asia", 													///
	"Other/NotSync")))

* Write the cleaned collection-level file
save "${data_clean}clean_collection", replace
